# ================== Jiangjiagou | 2003–2022 Soil Moisture + Covariates (incl. ρb) + SHAP ==================
suppressPackageStartupMessages({
  library(terra); library(sf); library(stringr)
  library(dplyr); library(blockCV); library(ggplot2)
  library(xgboost); library(Matrix)
  library(purrr)
})

# ------------------ Paths & parameters ------------------
# Shapefile holding the Jiangjiagou sampling points
pts_fp <- "E:/JJG/蒋家沟_采样点_像元中心.shp"

# Root directory of the soil moisture product (one sub-folder per year, 2003–2022)
sm_root_dir <- "F:/F盘 2023.1.13/2022 文章/2024 全球 全国 土壤水时间序列/中国1km分辨率逐日全天候地表土壤水分数据集 2003-2022"

# Candidate covariate rasters (TWI / DEM) under E:/JJG, listed in order of
# preference. NOTE(review): the "skip the variable if no candidate exists"
# behaviour is expected to be implemented where these are consumed — confirm.
twi_candidates <- c("E:/JJG/TWI_蒋家沟.tif", "E:/JJG/TWI.tif")
dem_candidates <- c("E:/JJG/DEM_蒋家沟.tif", "E:/JJG/DEM.tif")

# Soil rasters (Jiangjiagou): a named character vector of full paths keyed by
# variable name; "density" is the soil bulk density file.
soil_dir <- "E:/Soiltexture"
soil_files <- setNames(
  file.path(
    soil_dir,
    c("JJGsand.tif", "JJGsilt.tif", "JJGclay.tif", "JJGsoildensity.tif")
  ),
  c("sand", "silt", "clay", "density")
)

# Output locations (everything is written below E:/JJG)
out_dir <- "E:/JJG"
if (!dir.exists(out_dir)) {
  dir.create(out_dir, recursive = TRUE)
}
sm_out   <- file.path(out_dir, "SM_observed_Jiangjiagou_2003-2022.csv")
stk_out  <- file.path(out_dir, "Jiangjiagou_covariates_stack_2003-2022.tif")
imp_out  <- file.path(out_dir, "XGB_SHAP_importance_Jiangjiagou_2003-2022.csv")
plot_out <- file.path(out_dir, "XGB_SHAP_importance_Jiangjiagou_2003-2022.tif")

# Study period: first day of 2003 through last day of 2022
start_year <- 2003
end_year <- 2022
# NOTE(review): presumably the multiplier that converts stored raster values to
# physical soil moisture units — confirm against the dataset documentation.
scale_factor <- 0.001

# ------------------ Read sampling points ------------------
# Load the point layer without the driver's console chatter
pts <- sf::st_read(pts_fp, quiet = TRUE)

# Make sure every point carries an identifier: when the layer has no
# "point_id" attribute, number the points 1..n in file order.
if (!is.element("point_id", names(pts))) {
  pts$point_id <- seq_len(nrow(pts))
}

# ------------------ Build the list of all TIF files (2003–2022) ------------------
# Produces two parallel, date-sorted character vectors:
#   all_files - full paths of the accepted soil moisture rasters
#   all_dates - matching "YYYYDOY" labels (4-digit year + 3-digit day of year)
cat("Scanning 2003–2022 soil moisture files...\n")

# Build year-folder paths (format like "2003tif输出")
year_dirs <- file.path(sm_root_dir, sprintf("%04dtif输出", start_year:end_year))

# Keep only the year folders that exist. dir.exists() is used instead of
# file.exists() so a regular file with a matching name is not taken for a folder.
existing_dirs <- year_dirs[dir.exists(year_dirs)]
if (length(existing_dirs) == 0) {
  stop("No year folders found. Please check the path format. Expected: '2003tif输出', '2004tif输出', etc.")
}

cat("Found year folders: ", paste(basename(existing_dirs), collapse = ", "), "\n")

# Number of days in each year of `y` (Gregorian leap-year rule); vectorised.
sm_days_in_year <- function(y) {
  is_leap <- (y %% 4 == 0 & y %% 100 != 0) | y %% 400 == 0
  ifelse(is_leap, 366L, 365L)
}

# Scan one year folder.
# Returns list(files = <accepted paths>, dates = <"YYYYDOY" labels>), both in
# list.files() order. Files are skipped (with a console notice) when the name
# does not end in YYYYDOY, when the year in the name differs from the folder
# year, or when the day of year is not valid for that year.
scan_sm_year_dir <- function(year_dir) {
  # The folder name starts with the four-digit year
  year <- as.integer(substr(basename(year_dir), 1, 4))
  tif_files <- list.files(year_dir, pattern = "\\.tif$", full.names = TRUE)

  if (length(tif_files) == 0) {
    cat("⚠️ No TIF files found in folder ", year_dir, "\n")
    return(list(files = character(), dates = character()))
  }

  cat("Scanning ", year, ": ", length(tif_files), " files\n")

  # A single suffix pattern is enough: it matches SM_YYYYDOY.tif, SMYYYYDOY.tif
  # and any other name ending in seven digits, so the more specific patterns
  # that used to be tried first/last could never change the outcome.
  bn <- basename(tif_files)
  m <- stringr::str_match(bn, "(\\d{4})(\\d{3})(?:\\.tif)?$")
  file_year <- as.integer(m[, 2])
  doy <- as.integer(m[, 3])

  # Each mask implies the previous one, so no NA can leak into `keep`
  parsed  <- !is.na(doy)
  year_ok <- parsed & file_year == year
  keep    <- year_ok & doy >= 1L & doy <= sm_days_in_year(file_year)

  # Report every rejected file together with the actual reason
  reason <- ifelse(
    !parsed, "failed to parse date",
    ifelse(!year_ok, "year does not match folder", "day-of-year out of range")
  )
  cat(
    sprintf("⚠️ Skipping file (%s): %s\n", reason[!keep], bn[!keep]),
    sep = ""
  )

  list(
    files = tif_files[keep],
    dates = sprintf("%04d%03d", file_year[keep], doy[keep])
  )
}

# Scan all TIF files (one vectorised pass per folder instead of growing the
# result vectors file by file)
scanned <- lapply(existing_dirs, scan_sm_year_dir)
all_files <- as.character(unlist(lapply(scanned, `[[`, "files"), use.names = FALSE))
all_dates <- as.character(unlist(lapply(scanned, `[[`, "dates"), use.names = FALSE))

if (length(all_files) == 0) {
  stop("No valid 2003–2022 soil moisture files were found.")
}

# Sort by date (fixed-width "YYYYDOY" labels sort chronologically as text)
date_order <- order(all_dates)
all_files <- all_files[date_order]
all_dates <- all_dates[date_order]

# Two files resolving to the same date would yield duplicate time steps;
# they are kept, but flagged so the input folder can be checked.
if (anyDuplicated(all_dates) > 0) {
  cat("⚠️ Duplicate dates found: ",
      paste(unique(all_dates[duplicated(all_dates)]), collapse = ", "),
      "\n", sep = "")
}

cat("Found ", length(all_files), " soil moisture files, time span: ",
    min(all_dates), " - ", max(all_dates), "\n", sep = "")

# Show the first few files as examples
cat("First 5 files:\n")
cat(
  sprintf("  %s: %s\n", head(all_dates, 5), basename(head(all_files, 5))),
  sep = ""
)

# ------------------ Efficient extraction of long soil moisture time series ------------------
# Use the first file to determine projection, then bring the sampling points
# into that CRS and convert them to a terra vector
r0 <- rast(all_files[1])
pts_proj <- st_transform(pts, crs(r0))
vpts <- vect(pts_proj)

# Chunked processing to avoid memory overflow
chunk_size <- 100  # process 100 files at a time
n_chunks <- ceiling(length(all_files) / chunk_size)

cat("Starting soil moisture extraction (", n_chunks, " chunks)...\n", sep = "")

# Result buffer: one row per sampling point, one column per file in all_files.
# Preallocated as double (NA_real_) rather than logical NA so the first numeric
# write does not trigger a type conversion and full copy of the matrix.
sm_matrix <- matrix(NA_real_, nrow = nrow(pts), ncol = length(all_files))

for (chunk in 1:n_chunks) {
  start_idx <- (chunk - 1) * chunk_size + 1
  end_idx <- min(chunk * chunk_size, length(all_files))
  chunk_files <- all_files[start_idx:end_idx]
  
  cat(sprintf("Processing chunk %d/%d: files %d-%d (dates: %s to %s)\
